1 Setup

1.1 Import necessary R packages

library(enrichR)
library(DT)
library(viridis)
library(ggVennDiagram)
library(patchwork)
library(tidyverse)


devtools::load_all(".")

1.2 Load scRNA-seq datasets:

pbmc_lupus <- readRDS("~/popuDE/reproducibility/batch_correction/pbmc_lupus_bc.rds")
pbmc_covid <- readRDS("~/popuDE/reproducibility/batch_correction/pbmc_covid_bc.rds")

1.3 Define evaluation functions:

plot.venn <- function(popuDE.res, baseline.res, p.sig, thre.type,
                      logFC.table=NULL, logFC.threshold=0.25, 
                      limits=c(0, 3000)){
  celltypes <- dimnames(baseline.res[[1]])[[2]]
  subjects <- dimnames(baseline.res[[1]])[[3]]
  pic.list <- list()
  if (thre.type == 'fdr'){
      thre.name <- names(baseline.res)[str_ends(names(baseline.res), 'fdr_info')]
    }else if (thre.type == 'pval'){
      thre.name <- names(baseline.res)[str_ends(names(baseline.res), 'pval_info')]
    }else{
      stop("Wrong 'thre.type' argument.")
    }
  bs.name <- str_split_i(thre.name, pattern = '\\.', i=1)
  
for (celltype in celltypes){
  p.markers <- names(which(popuDE.res[[celltype]]$pp.d1 > 1-p.sig))
  
  if (length(subjects) == 1){
    w.markers <- names(which(baseline.res[[thre.name]][,celltype,] < p.sig))
  }else{
    w.markers <- names(which(rowSums(baseline.res[[thre.name]][,celltype,]< p.sig) > 0))
  }
  
  if (!is.null(logFC.table)){
    fc.genes <- names(which(logFC.table[,celltype] > logFC.threshold))
    p.markers <- intersect(p.markers, fc.genes)
    w.markers <- intersect(w.markers, fc.genes)
  }

  
  to.plot.data <- list('Our method'=p.markers, bs.name=w.markers)
  names(to.plot.data) <- c('Our method', bs.name)
  pic <- ggVennDiagram(to.plot.data, label_alpha=0, label_color = "red", label_size = 4, edge_size = 1) + 
    scale_x_continuous(expand = expansion(mult = .2)) + 
    scale_fill_distiller(palette = "YlGnBu", direction = 1, limits = limits) + 
    labs(title=celltype) + 
    theme(text = element_text(size = 18), 
          plot.title=element_text(hjust=0.5, size=18, face='bold'))
  pic.list[[celltype]] <- pic
}

p <- pic.list[[names(pic.list)[1]]]
for (celltype in names(pic.list)[-1]){
  p <- p + pic.list[[celltype]]
}
p + plot_layout(ncol = 2) + 
  plot_annotation(title=str_glue("adj.p.threshold = {p.sig}"), theme = theme(plot.title = element_text(hjust=0.5, size = 20, face='bold')))
}




enrich.sig.genes <- function(popuDE.res, baseline.res, p.sig, is.popuDE.markers, is.baseline.markers){
  celltypes <- dimnames(baseline.res[[1]])[[2]]
  subjects <- dimnames(baseline.res[[1]])[[3]]
  fdr.name <- names(baseline.res)[str_ends(names(baseline.res), 'fdr_info')]
  bs.name <- str_split_i(fdr.name, pattern = '\\.', i=1)
  
  res.list = list()
  for (celltype in celltypes){
    p.markers <- names(which(sort(popuDE.res[[celltype]]$pp.d1, decreasing = T) > 1-p.sig))
    if (length(subjects) == 1){
      w.markers <- names(which(sort(baseline.res[[fdr.name]][,celltype,]) < p.sig))  
    }else{
      w.markers <- names(which(rowSums(baseline.res[[fdr.name]][,celltype,]< p.sig) > 0))
    }
    p.not.bs <- setdiff(p.markers, w.markers)
    bs.not.p <- setdiff(w.markers, p.markers)
    p.and.bs <- intersect(p.markers, w.markers)
    
    number.cutoff <- min(length(bs.not.p), length(p.not.bs))
    cat(celltype, "number of genes cut off:", number.cutoff, "\n")
    
    if (is.popuDE.markers & is.baseline.markers){ # both p-markers and w-markers
      to.enrich.genes <- p.and.bs
    }else if (is.popuDE.markers & !is.baseline.markers){ # p-markers but not w-markers
      if (length(subjects) == 1){
        # for corrected lognormcounts as a whole, 
        # ensure the number of bs-markers == the number of p-markers
        cat("Ensure the number of baseline markers == p-markers\n")
        to.enrich.genes <- p.not.bs[1:number.cutoff]
      }else{
        to.enrich.genes <- p.not.bs
      }
    }else if (!is.popuDE.markers & is.baseline.markers){ # not p-markers but w-markers
      if (length(subjects) == 1){
        # for corrected lognormcounts as a whole, 
        # ensure the number of bs-markers == the number of p-markers
        cat("Ensure the number of baseline markers == p-markers\n")
        to.enrich.genes <- bs.not.p[1:number.cutoff]
      }else{
        to.enrich.genes <- bs.not.p
      }
    }else{
      stop("is.popuDE.markers and is.baseline.markers can not both be FALSE.")
    }
    
    if (length(to.enrich.genes) > 0){
      enriched <- enrichr(to.enrich.genes, c('Human_Gene_Atlas'))
      celltype.res <- enriched$Human_Gene_Atlas[1:10,c(1:4)] %>% 
        format(scientific=T, digits=4) %>% 
        unite(!!sym(celltype), c('Term', 'Adjusted.P.value'), sep = '\n P.adj=') %>% 
        dplyr::select(!!sym(celltype))
      res.list <- append(res.list, celltype.res)
      cat("DE genes of ", celltype, "has been enriched.")
    }
  }
  caption <- str_glue("is.our.method.markers = {is.popuDE.markers},
                       is.{bs.name}.markers = {is.baseline.markers},
                       adj.prob.threshold = {p.sig}")
  
  res.list %>% bind_rows() %>%
  datatable(extensions = 'Buttons',
            caption = caption,
            options = list(dom = 'Blfrtip',
                           buttons = c('copy', 'csv', 'excel', 'pdf', 'print'),
                           lengthMenu = list(c(10,25,50,-1),
                                             c(10,25,50,"All"))))
}



load.or.run <- function(our.method, bs.method, dataset, sub.rep, ct.rep, per.subject){
  norm.rep <- if (per.subject) NULL else {"scMerge_logcounts"}
  # norm.rep <- ifelse(per.subject, NULL, "scMerge_logcounts")
  count.type <- ifelse(per.subject, 'uncorrected_normcounts', 'corrected_lognormcounts')
  save.path <- file.path('result', dataset, count.type)
  if (!dir.exists(save.path)) dir.create(save.path)
  bs.file.prefix <- ifelse(per.subject, 'per_subject.rds', 'as_whole.rds')
  bs.res.file <- paste(bs.method, bs.file.prefix, sep = '_')
  bs.res.path <- file.path(save.path, bs.res.file)
  our.res.file <- paste0(our.method, '.rds')
  our.res.path <- file.path(save.path, our.res.file)
  
  
  if (bs.res.file %in% list.files(save.path)){bs.res <- readRDS(bs.res.path)}else{
    bs.res <- runBaselineMethod(
      get(dataset), per.subject=per.subject, numCores=20, celltype.rep=ct.rep,
      subject.rep=sub.rep, method=bs.method, use.norm.rep=norm.rep,
      log.input=!per.subject)
    saveRDS(bs.res, bs.res.path)
  }
  
  if (our.res.file %in% list.files(save.path)){popuDE.res <- readRDS(our.res.path)}else{
    popuDE.res <- popuDE(
      get(dataset), effect_thres=0, tol=1e-5, numCores=20, celltype=ct.rep,
      subject=sub.rep, use.norm.rep=norm.rep, 
      log.input=!per.subject)
    saveRDS(popuDE.res, our.res.path)
  }
  return(list(our=popuDE.res, bs=bs.res))
}

2 PBMC-SLE, uncorrected per subject (original results)

res <- load.or.run('popuDE', 'wilcox', 'pbmc_lupus', 'subject', 'celltype', per.subject=TRUE)
plot.venn(res$our, res$bs, p.sig=0.05, thre.type='fdr', limits=c(0, 5500))

enrich.sig.genes(res$our, res$bs, p.sig=0.05, is.popuDE.markers=T, is.baseline.markers=F)
## B cells number of genes cut off: 38 
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  B cells has been enriched.CD14+ Monocytes number of genes cut off: 0 
## CD4 T cells number of genes cut off: 151 
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  CD4 T cells has been enriched.CD8 T cells number of genes cut off: 174 
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  CD8 T cells has been enriched.Dendritic cells number of genes cut off: 0 
## FCGR3A+ Monocytes number of genes cut off: 0 
## NK cells number of genes cut off: 391 
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  NK cells has been enriched.

3 PBMC-SLE, corrected as whole

res <- load.or.run('popuDE', 'wilcox', 'pbmc_lupus', 'subject', 'celltype', per.subject=FALSE)
plot.venn(res$our, res$bs, p.sig=0.05, thre.type='fdr', limits=c(0, 3500))

enrich.sig.genes(res$our, res$bs, p.sig=0.05, is.popuDE.markers=T, is.baseline.markers=F)
## B cells number of genes cut off: 443 
## Ensure the number of baseline markers == p-markers
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  B cells has been enriched.CD14+ Monocytes number of genes cut off: 455 
## Ensure the number of baseline markers == p-markers
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  CD14+ Monocytes has been enriched.CD4 T cells number of genes cut off: 1310 
## Ensure the number of baseline markers == p-markers
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  CD4 T cells has been enriched.CD8 T cells number of genes cut off: 503 
## Ensure the number of baseline markers == p-markers
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  CD8 T cells has been enriched.Dendritic cells number of genes cut off: 36 
## Ensure the number of baseline markers == p-markers
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  Dendritic cells has been enriched.FCGR3A+ Monocytes number of genes cut off: 198 
## Ensure the number of baseline markers == p-markers
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  FCGR3A+ Monocytes has been enriched.NK cells number of genes cut off: 1054 
## Ensure the number of baseline markers == p-markers
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  NK cells has been enriched.
enrich.sig.genes(res$our, res$bs, p.sig=0.05, is.popuDE.markers=F, is.baseline.markers=T)
## B cells number of genes cut off: 443 
## Ensure the number of baseline markers == p-markers
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  B cells has been enriched.CD14+ Monocytes number of genes cut off: 455 
## Ensure the number of baseline markers == p-markers
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  CD14+ Monocytes has been enriched.CD4 T cells number of genes cut off: 1310 
## Ensure the number of baseline markers == p-markers
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  CD4 T cells has been enriched.CD8 T cells number of genes cut off: 503 
## Ensure the number of baseline markers == p-markers
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  CD8 T cells has been enriched.Dendritic cells number of genes cut off: 36 
## Ensure the number of baseline markers == p-markers
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  Dendritic cells has been enriched.FCGR3A+ Monocytes number of genes cut off: 198 
## Ensure the number of baseline markers == p-markers
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  FCGR3A+ Monocytes has been enriched.NK cells number of genes cut off: 1054 
## Ensure the number of baseline markers == p-markers
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  NK cells has been enriched.

4 PBMC-COVID19, uncorrected per subject

res <- load.or.run('popuDE', 'wilcox', 'pbmc_covid', 'sampleID', 'majorType', per.subject=TRUE)
plot.venn(res$our, res$bs, p.sig=0.05, thre.type='fdr', limits=c(0, 8000))

enrich.sig.genes(res$our, res$bs, p.sig=0.05, is.popuDE.markers=T, is.baseline.markers=F)
## B number of genes cut off: 477 
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  B has been enriched.CD4 number of genes cut off: 193 
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  CD4 has been enriched.CD8 number of genes cut off: 1115 
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  CD8 has been enriched.DC number of genes cut off: 2 
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  DC has been enriched.Mega number of genes cut off: 35 
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  Mega has been enriched.Mono number of genes cut off: 1 
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  Mono has been enriched.NK number of genes cut off: 862 
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  NK has been enriched.Plasma number of genes cut off: 0

5 PBMC-COVID19, corrected as whole

res <- load.or.run('popuDE', 'wilcox', 'pbmc_covid', 'sampleID', 'majorType', per.subject=FALSE)
plot.venn(res$our, res$bs, p.sig=0.05, thre.type='fdr', limits=c(0, 6000))

enrich.sig.genes(res$our, res$bs, p.sig=0.05, is.popuDE.markers=T, is.baseline.markers=F)
## B number of genes cut off: 1613 
## Ensure the number of baseline markers == p-markers
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  B has been enriched.CD4 number of genes cut off: 2314 
## Ensure the number of baseline markers == p-markers
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  CD4 has been enriched.CD8 number of genes cut off: 1787 
## Ensure the number of baseline markers == p-markers
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  CD8 has been enriched.DC number of genes cut off: 119 
## Ensure the number of baseline markers == p-markers
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  DC has been enriched.Mega number of genes cut off: 627 
## Ensure the number of baseline markers == p-markers
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  Mega has been enriched.Mono number of genes cut off: 1538 
## Ensure the number of baseline markers == p-markers
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  Mono has been enriched.NK number of genes cut off: 1953 
## Ensure the number of baseline markers == p-markers
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  NK has been enriched.Plasma number of genes cut off: 197 
## Ensure the number of baseline markers == p-markers
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  Plasma has been enriched.
enrich.sig.genes(res$our, res$bs, p.sig=0.05, is.popuDE.markers=F, is.baseline.markers=T)
## B number of genes cut off: 1613 
## Ensure the number of baseline markers == p-markers
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  B has been enriched.CD4 number of genes cut off: 2314 
## Ensure the number of baseline markers == p-markers
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  CD4 has been enriched.CD8 number of genes cut off: 1787 
## Ensure the number of baseline markers == p-markers
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  CD8 has been enriched.DC number of genes cut off: 119 
## Ensure the number of baseline markers == p-markers
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  DC has been enriched.Mega number of genes cut off: 627 
## Ensure the number of baseline markers == p-markers
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  Mega has been enriched.Mono number of genes cut off: 1538 
## Ensure the number of baseline markers == p-markers
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  Mono has been enriched.NK number of genes cut off: 1953 
## Ensure the number of baseline markers == p-markers
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  NK has been enriched.Plasma number of genes cut off: 197 
## Ensure the number of baseline markers == p-markers
## Uploading data to Enrichr... Done.
##   Querying Human_Gene_Atlas... Done.
## Parsing results... Done.
## DE genes of  Plasma has been enriched.